Input data and configuration

# get default parameters the papermill way.
input_file = "results/04_annotate_cell_types/adata.h5ad"
output_file = "results/05_prepare_de/adata.h5ad"
output_file_obs = "results/05_prepare_de/adata_obs.tsv"
results_dir = "results/05_prepare_de"
cpus = 32
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
output_file_obs = "adata_obs.tsv"
cpus = "32"
results_dir = "."
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
import os
import sys
import gc

sys.path.append("lib")
sys.path.append("../lib")

from jupytertools import setwd, fix_logging, display

from toolz.functoolz import pipe, partial

from multiprocessing import Pool
import seaborn as sns
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats
import itertools

setwd()
fix_logging(sc.settings)
Working directory did not change because calling from nextflow. 
# setup R integration
import rpy2.rinterface_lib.callbacks
import anndata2ri
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
%%R
library(conflicted)
conflict_prefer("Position", "base")
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(edgeR)
options(max.print=100)
options(repr.matrix.max.cols=50, repr.matrix.max.rows=6)
R[write to console]: [conflicted] Will prefer base::Position over any other package

R[write to console]: Loading required package: magrittr

R[write to console]: Loading required package: limma

markers = pd.read_csv("tables/cell_type_markers.csv")
adata = sc.read_h5ad(input_file)

Analysis of T cells

adata.obs.columns
Index(['batch', 'samples', 'patient', 'origin', 'replicate', 'dataset',
       'tumor_type', 'platform', 'age', 'sex', 'hpv_status', 'ir_status',
       'facs_purity_cd3', 'facs_purity_cd56', 'n_genes', 'mt_frac', 'n_counts',
       'rk_n_counts', 'rk_n_genes', 'rk_mt_frac', 'doublet_score',
       'is_doublet', 'leiden', 'S_score', 'G2M_score', 'phase', 'cell_type',
       'cell_type_unknown', 'cell_type_coarse'],
      dtype='object')
# subset to T cells
adata_all = adata
mask = adata.obs["cell_type_coarse"].isin(["T cell", "NK cell"])
adata = adata[mask, :].copy()
sc.pl.umap(adata, color="cell_type")

Redo neighbors, umap, clustering

adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, raw=adata.raw)
adata.uns["norm_log"] = True
sc.pp.pca(adata)
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)
Note that scikit-learn's randomized PCA might not be exactly reproducible across different computational platforms. For exact reproducibility, choose `svd_solver='arpack'.` This will likely become the Scanpy default in the future.
computing PCA with n_comps = 50
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
sc.pl.umap(adata, color=["samples", "cell_type", "hpv_status", "ir_status"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])

inspect and correct for batch effects

  • Using combat resulted in a blob with a lot of signal lost.
  • The patients are admixed fairly well already (and will improve further after HVG filtering (see below))
  • I don't use combat here therefore.

(using combat with covariates did not work out -> singular matrix, i.e. too few samples per group)

HVG-filtering

  • variable genes reduced to 3000, as we are only dealing with cells of the same major type now.
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
sc.pl.highly_variable_genes(adata)
extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

redo neighbors and clustering after HVG filtering.

  • patient admixture looks a lot better now
  • batch effects appear not to be a major issue.
sc.pp.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)
computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
sc.pl.umap(adata, color=["samples", "cell_type", "hpv_status", "ir_status"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])

Do clustering for all cell types individually

adatas = {
    "CD4": adata[
        adata.obs["cell_type"]
        .isin(["T cell CD4+ non-regulatory", "T cell regulatory"])
        .tolist(),
        :,
    ].copy(),
    "CD8": adata[(adata.obs["cell_type"] == "T cell CD8+").tolist(), :].copy(),
    "NK": adata[(adata.obs["cell_type"] == "NK cell").tolist(), :].copy(),
}
for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.pp.highly_variable_genes(tmp_adata, flavor="cell_ranger", n_top_genes=2000)
    sc.pl.highly_variable_genes(tmp_adata)
###########################
CD4
###########################


extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

###########################
CD8
###########################


extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

###########################
NK
###########################


extracting highly variable genes
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: divide by zero encountered in true_divide
  ) / disp_mad_bin[df['mean_bin'].values].values
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: invalid value encountered in true_divide
  ) / disp_mad_bin[df['mean_bin'].values].values
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.pp.pca(tmp_adata, svd_solver="arpack")
    sc.pp.neighbors(tmp_adata, n_neighbors=10)
    sc.tl.umap(tmp_adata)
    sc.pl.umap(tmp_adata, color=["samples", "cell_type", "hpv_status", "ir_status"])
###########################
CD4
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

###########################
CD8
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

###########################
NK
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

def leiden_with_r(adata_key, r, seed):
    key = "leiden_{:.3f}".format(r)
    sc.tl.leiden(adatas[adata_key], resolution=r, key_added=key, random_state=seed)
    return adatas[adata_key].obs[key]
def test_leiden_thresholds(adata_key, resolutions, seeds, n_cpus):
    """
    Test different leiden thresholds. 
    
    Args:
        adata:
        resolutions: numpy array containing all resolutions to test
        seeds: numpy array containin random seeds (every resolution is tested with every seed) 
        p: multiprocessing.Pool 
    """
    args = list(itertools.product([adata_key], resolutions, seeds))
    #     p = lambda: None
    #     p.starmap = lambda x, a: [x for x in itertools.starmap(x, a)]
    leiden_results = p.starmap(leiden_with_r, args)
    n_clusters = [leiden_results[i].cat.categories.size for i, _ in enumerate(args)]
    # re-arrange results in dataframe and aggregate by mean.
    clusters = {s: dict() for s in seeds}

    for i, (a, r, s) in enumerate(args):
        clusters[s][r] = n_clusters[i]

    clusters_mean = np.mean(pd.DataFrame.from_dict(clusters).values, axis=1)

    return clusters_mean
%%capture
p = Pool(int(cpus))
leiden_thres = {
    "CD4": 0.2,
    "CD8": 0.2,
    "NK": 0.2,
}
# test_leiden_thresholds("NK", resolutions=np.arange(0.1, 0.2, 0.05), seeds=np.arange(0, 3), n_cpus=1)
resolutions = np.arange(0.1, 1.5, 0.05)
seeds = np.arange(0, 10)
for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    clusters_mean = test_leiden_thresholds(ct, resolutions, seeds, 16)
    plt.plot(resolutions, clusters_mean)
    # plt.plot(resolutions, pd.Series(clusters_mean).rolling(3), color="red")
    plt.xlabel("leiden resolution")
    plt.ylabel("#clusters")
    plt.vlines(x=leiden_thres[ct], ymin=0, ymax=plt.ylim()[1], color="grey")
    plt.show()
###########################
CD4
###########################


###########################
CD8
###########################


###########################
NK
###########################


There is no clear plateau... anyway, 10 clusters sounds reasonable.

for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.tl.leiden(tmp_adata, resolution=leiden_thres[ct])
    sc.pl.umap(tmp_adata, color="leiden", legend_loc="on data")
    sc.tl.rank_genes_groups(tmp_adata, groupby="leiden")
    sc.pl.rank_genes_groups(tmp_adata)
###########################
CD4
###########################


running Leiden clustering
    finished

ranking genes
    finished

###########################
CD8
###########################


running Leiden clustering
    finished

ranking genes
    finished

###########################
NK
###########################


running Leiden clustering
    finished

ranking genes
    finished

Write output adata.h5ad

for ct, tmp_adata in adatas.items():
    adata.obs.loc[tmp_adata.obs.index, "cluster"] = [
        "{}_{}".format(ct, x) for x in tmp_adata.obs["leiden"]
    ]
sc.pl.umap(adata, color=["cluster", "cell_type"], legend_loc="on data")
... storing 'cluster' as categorical

adata.write_h5ad(output_file, compression="lzf")
adata.obs.to_csv(output_file_obs, sep="\t")

DE questions

  • T cell clusters: all against all
  • HPV+ vs HPV- (by coarse cell type)
  • IR+ vs IR- (by coarse cell type)
adata_edger = adata.copy()
# edgeR expects raw counts, normalization does not have any effect. We can therefore simply undo log1p
adata_edger.X = np.expm1(adata_edger.X)
hpv_map = {"HPV16+": "hpv_pos", "HPV-": "hpv_neg"}
ct_map = {
    "T cell CD8+": "t_cd8",
    "T cell regulatory": "t_reg",
    "T cell CD4+ non-regulatory": "t_cd4",
    "NK cell": "nk",
}
ir_map = {"IR+": "ir_pos", "IR-": "ir_neg"}


def remap(adata, col, dict_):
    adata.obs[col] = [dict_.get(x, x) for x in adata.obs[col]]


remap(adata_edger, "hpv_status", hpv_map)
remap(adata_edger, "ir_status", ir_map)
remap(adata_edger, "cell_type", ct_map)
adata_edger.obs["cluster"].unique()
[CD4_2, CD4_0, CD8_2, CD8_0, CD8_3, NK_0, CD4_1, CD8_1, NK_2, NK_1]
Categories (10, object): [CD4_2, CD4_0, CD8_2, CD8_0, ..., CD4_1, CD8_1, NK_2, NK_1]
de_dir = results_dir
!mkdir -p {de_dir}
%%R -i adata_edger -i de_dir
adata = adata_edger
dim(adata)
[1]
 16818
 23025

make bulk

As we compare between patients and not between cells, it seems advantageous to create "artificial bulk" samples. If we test on the single cells between patients, expression changes that are driven by a single patient (and might be batch effects) become significant.

The data in adata_edger is normalized and non-log-transformed (the log-transformation was undone above)

bobs = (
    adata_edger.obs.groupby(["patient", "hpv_status", "ir_status"])
    .agg(average_mito=("mt_frac", np.mean), total_reads=("n_counts", np.sum))
    .reset_index()
    .set_index("patient")
    .dropna()
)


bulk_Xs = {}
tmp_cell_types = ["overall", "t_cd8", "t_cd4", "t_reg", "nk"]
for cell_type in tmp_cell_types:
    ct_mask = (
        (adata_edger.obs["cell_type"] == cell_type).values
        if cell_type != "overall"
        else True
    )
    bulk_Xs[cell_type] = OrderedDict()
    for patient in bobs.index:
        patient_mask = (adata_edger.obs["patient"] == patient).values
        bulk_Xs[cell_type][patient] = np.mean(
            adata_edger.X[ct_mask & patient_mask, :], axis=0
        )
        assert bulk_Xs[cell_type][patient].size == adata.shape[1]

# bdata = sc.AnnData(var = adata.var, )
bdatas = []
for cell_type in tmp_cell_types:
    tmp_colnames = np.hstack([n for n in bulk_Xs[cell_type].keys()])
    assert np.all(tmp_colnames == bobs.index), "order of samples matches obs"
    X = np.vstack([s for s in bulk_Xs[cell_type].values()])
    bdatas.append(sc.AnnData(var=adata_edger.var, obs=bobs, X=X))
%%R -i bdatas -i tmp_cell_types
names(bdatas) = tmp_cell_types

T cell clusters

%%R
design = model.matrix(~0 + leiden + patient + n_genes + mt_frac, data=colData(adata))
%%R
#' make contrasts: one against all others.
#' 
#' @param design design matrix
#' @param col_data colData or the SingleCellExperiment object. 
#' @param column column name that is used for the contrasts. Also needs to be 
#'    specified as first variable in the model.matrix.
make_contrasts = function(design, col_data, column) {
    n_clus = length(unique(col_data[[column]]))
    upper_block = matrix(rep(-1/(n_clus-1), n_clus^2), ncol=n_clus)
    diag(upper_block) = rep(1, n_clus)
    lower_block = matrix(rep(0, (ncol(design)-n_clus) * n_clus), ncol=n_clus)
    contrasts = rbind(upper_block, lower_block)
    rownames(contrasts) = colnames(design)
    colnames(contrasts) = colnames(design)[1:n_clus]
    contrasts
}
%%R
# per cell type
adata_bk = adata
cell_types = list("t_cd8", c("t_cd4", "t_reg"), "nk")
lapply(cell_types, function(ct) {
    tmp_adata = adata_bk[, colData(adata_bk)$cell_type %in% ct]
    colData(tmp_adata)$cluster = droplevels(colData(tmp_adata)$cluster)
    print(dim(colData(tmp_adata)))
    design = model.matrix(~0 + cluster + patient + n_genes + mt_frac, data=colData(tmp_adata))
    print(dim(design))
    contrasts = make_contrasts(design, colData(tmp_adata), "cluster")
    adata=tmp_adata
    save(adata, design, contrasts, file=paste0(de_dir, '/cluster_', ct[1], '.rda'), compress=FALSE)
})
adata = adata_bk
[1]
 8663
   30

[1]
 8663
   18

[1]
 11582
    30

[1]
 11582
    17

[1]
 2780
   30

[1]
 2780
   17

HPV+ vs. HPV-

%%R
design = model.matrix(
    ~0 + hpv_status + cell_type + n_genes + mt_frac, data=colData(adata)
)
contrasts = makeContrasts(
    overall=hpv_statushpv_pos - hpv_statushpv_neg, levels=colnames(design)
)
save(adata, design, contrasts, file=paste0(de_dir, "/hpv.rda"), compress=FALSE)
%%R
# per cell type
adata_bk = adata
cell_types = c("t_cd8", "t_cd4", "t_reg", "nk")
lapply(cell_types, function(ct) {
    tmp_adata = adata_bk[, colData(adata_bk)$cell_type == ct]
    design = model.matrix(~0 + hpv_status + n_genes + mt_frac, data=colData(tmp_adata))
    contrasts = makeContrasts(
        overall = hpv_statushpv_pos - hpv_statushpv_neg,
        levels = colnames(design)
    )
    adata=tmp_adata
    save(adata, design, contrasts, file=paste0(de_dir, '/hpv_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk

bulk

%%R
adata_bk = adata
lapply(names(bdatas), function(ct) {
    bdata = bdatas[[ct]]
    design = model.matrix(~0 + hpv_status + average_mito + total_reads, data=colData(bdata))
    contrasts = makeContrasts(
        overall = hpv_statushpv_pos - hpv_statushpv_neg,
        levels = colnames(design)
    )
    adata=bdata
    save(adata, design, contrasts, file=paste0(de_dir, '/bulk_hpv_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk

IR+ vs. IR-

%%R
adata_bk = adata
tmp_adata = adata[, colData(adata)$hpv_status == "hpv_pos"]
design = model.matrix(~0 + ir_status + cell_type + n_genes + mt_frac, data=colData(tmp_adata))
contrasts = makeContrasts(
    overall = ir_statusir_pos - ir_statusir_neg,
    levels = colnames(design)
)
adata = tmp_adata
save(adata, design, contrasts, file=paste0(de_dir, '/ir.rda'), compress=FALSE)
adata = adata_bk
%%R
# per cell type
adata_bk = adata
cell_types = c("t_cd8", "t_cd4", "t_reg", "nk")
lapply(cell_types, function(ct) {
    tmp_adata = adata[, colData(adata_bk)$cell_type == ct & colData(adata_bk)$hpv_status == "hpv_pos"]
    design = model.matrix(~0 + ir_status + n_genes + mt_frac, data=colData(tmp_adata))
    contrasts = makeContrasts(
        overall = ir_statusir_pos - ir_statusir_neg,
        levels = colnames(design)
    )
    adata=tmp_adata
    save(adata, design, contrasts, file=paste0(de_dir, '/ir_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk

bulk

%%R
adata_bk = adata
lapply(names(bdatas), function(ct) {
    bdata = bdatas[[ct]]
    design = model.matrix(~0 + ir_status + average_mito + total_reads, data=colData(bdata))
    contrasts = makeContrasts(
        overall = ir_statusir_pos - ir_statusir_neg,
        levels = colnames(design)
    )
    adata=bdata
    save(adata, design, contrasts, file=paste0(de_dir, '/bulk_ir_', ct, '.rda'), compress=FALSE)
})
adata = adata_bk